{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "147f9480",
   "metadata": {},
   "source": [
    "## Xview Dataset Download \n",
    "\n",
    "This component downloads a labeled overhead image dataset (xView) to a specified location. It automates Chrome through Selenium, so the path to a local chromedriver executable must be provided.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c185c1f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install selenium"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc0554b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import os\n",
    "import shutil\n",
    "import time\n",
    "from urllib.parse import urlparse\n",
    "\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.chrome.service import Service\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "866d16c3",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Username used to log in to the Xview webpage (typed into the login form's e-mail field)\n",
    "username = os.environ.get('username')\n",
    "\n",
    "# Password used to log in to the Xview webpage\n",
    "password = os.environ.get('password')\n",
    "\n",
    "# Directory where the dataset should be saved\n",
    "move_to_dir = os.environ.get('move_to_dir')\n",
    "\n",
    "# Path to the local chromedriver executable\n",
    "chromedriver_path = os.environ.get('chromedriver_path')\n",
    "\n",
    "# Maximum time in seconds to wait for the download before giving up; must be adjusted\n",
    "# according to the file size and internet speed. Environment values are strings, so the\n",
    "# value is converted to a number here; it falls back to 3600 seconds when unset.\n",
    "max_download_time = float(os.environ.get('max_download_time', 3600))\n",
    "\n",
    "# The label of the file to download.\n",
    "# Choose from \"TI.zip\", \"TL.zip\", \"VI.zip\", \"TI.tgz\", \"TL.tgz\", \"VI.tgz\",\n",
    "# standing for TI=Training Images, TL=Training Labels, VI=Validation Images\n",
    "label = os.environ.get('label')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "794506c5",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def login_and_download(username, password, move_to_dir, chromedriver_path, max_download_time, label):\n",
    "    \"\"\"Log in to the xView challenge site and download one dataset archive with Chrome.\n",
    "\n",
    "    Args:\n",
    "        username: Value typed into the login form's ``email`` field.\n",
    "        password: Value typed into the login form's ``password`` field.\n",
    "        move_to_dir: Directory Chrome is configured to download into; created if missing.\n",
    "        chromedriver_path: Path to the local chromedriver executable.\n",
    "        max_download_time: Maximum number of seconds to wait for the file to appear in\n",
    "            ``move_to_dir``. Any value accepted by ``float()`` works, e.g. a string read\n",
    "            from an environment variable.\n",
    "        label: Archive to download: ``TI.zip``, ``TL.zip``, ``VI.zip``, ``TI.tgz``,\n",
    "            ``TL.tgz`` or ``VI.tgz`` (TI = training images, TL = training labels,\n",
    "            VI = validation images).\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If ``label`` is not one of the supported options.\n",
    "    \"\"\"\n",
    "    # Link text shown on the download page for each supported label\n",
    "    link_text_by_label = {\n",
    "        \"TI.zip\": \"Download Training Images (zip)\",\n",
    "        \"TL.zip\": \"Download Training Labels (zip)\",\n",
    "        \"VI.zip\": \"Download Validation Images (zip)\",\n",
    "        \"TI.tgz\": \"Download Training Images (tgz)\",\n",
    "        \"TL.tgz\": \"Download Training Labels (tgz)\",\n",
    "        \"VI.tgz\": \"Download Validation Images (tgz)\",\n",
    "    }\n",
    "\n",
    "    # Validate the label before launching Chrome so that a bad value fails immediately\n",
    "    # instead of after a browser has been started and logged in\n",
    "    if label not in link_text_by_label:\n",
    "        raise ValueError(\"Error: This is an invalid download option\")\n",
    "    search_text = f'//a[contains(text(), \"{link_text_by_label[label]}\")]'\n",
    "\n",
    "    # Values read from the environment are strings; the timeout check below needs a number\n",
    "    max_download_time = float(max_download_time)\n",
    "\n",
    "    # Make sure the download directory exists before Chrome is asked to save into it\n",
    "    os.makedirs(move_to_dir, exist_ok=True)\n",
    "\n",
    "    # Set Chrome options to automatically download files to the specified directory\n",
    "    options = webdriver.ChromeOptions()\n",
    "    prefs = {\n",
    "        \"download.default_directory\": move_to_dir,\n",
    "        \"download.prompt_for_download\": False,\n",
    "        \"download.directory_upgrade\": True,\n",
    "        \"safebrowsing.enabled\": True\n",
    "    }\n",
    "    options.add_experimental_option(\"prefs\", prefs)\n",
    "\n",
    "    # Start a new instance of Chrome. Selenium 4 receives the driver path through a\n",
    "    # Service object; the executable_path keyword is no longer accepted.\n",
    "    driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)\n",
    "\n",
    "    # try/finally guarantees the browser is closed even when a wait times out\n",
    "    try:\n",
    "        # Open the login page\n",
    "        driver.get(r'https://challenge.xviewdataset.org/login')\n",
    "\n",
    "        # Find the username and password fields and enter credentials\n",
    "        username_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'email')))\n",
    "        password_field = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.NAME, 'password')))\n",
    "        username_field.send_keys(username)\n",
    "        password_field.send_keys(password)\n",
    "\n",
    "        # Find and click the login button\n",
    "        login_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, 'btn.primary')))\n",
    "        login_button.click()\n",
    "\n",
    "        # Wait for the page to load after login\n",
    "        time.sleep(1)\n",
    "\n",
    "        # Open the Download page\n",
    "        driver.get(r'https://challenge.xviewdataset.org/download-links')\n",
    "\n",
    "        # Wait for the automatic pop-up overlay to be present\n",
    "        WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CLASS_NAME, 'overlay--active')))\n",
    "\n",
    "        # Dismiss the overlay by clicking on the page body\n",
    "        # (the find_element_by_* helpers no longer exist in current Selenium 4)\n",
    "        driver.find_element(By.TAG_NAME, 'body').click()\n",
    "        time.sleep(1)\n",
    "\n",
    "        # Wait for the download link to be present\n",
    "        download_link_element = WebDriverWait(driver, 100).until(EC.presence_of_element_located((By.XPATH, search_text)))\n",
    "\n",
    "        # Get the dynamic download link from the href attribute\n",
    "        download_link = download_link_element.get_attribute('href')\n",
    "        if not download_link:\n",
    "            print(\"Failed to get the download link.\")\n",
    "            return\n",
    "\n",
    "        # Download the dataset using the obtained link\n",
    "        driver.get(download_link)\n",
    "        print(\"Dataset download started successfully.\")\n",
    "\n",
    "        # Extract the filename from the download link URL\n",
    "        filename = urlparse(download_link).path.split('/')[-1]\n",
    "        downloaded_file = os.path.join(move_to_dir, filename)\n",
    "        print(downloaded_file)\n",
    "\n",
    "        # Poll every 5 seconds until the file exists and is non-empty, or the deadline passes\n",
    "        deadline = time.monotonic() + max_download_time\n",
    "        while True:\n",
    "            if os.path.exists(downloaded_file) and os.path.getsize(downloaded_file) > 0:\n",
    "                print(\"File downloaded successfully.\")\n",
    "                break\n",
    "            if time.monotonic() > deadline:\n",
    "                print(\"Error: Maximum wait time exceeded.\")\n",
    "                break\n",
    "            time.sleep(5)\n",
    "    finally:\n",
    "        # Close the browser\n",
    "        driver.quit()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b2f96d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the download with the settings defined in the configuration cell\n",
    "login_and_download(\n",
    "    username=username,\n",
    "    password=password,\n",
    "    move_to_dir=move_to_dir,\n",
    "    chromedriver_path=chromedriver_path,\n",
    "    max_download_time=max_download_time,\n",
    "    label=label,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
